Last Update: 2019-03-11 20:09:06
Before we start, let’s load a few libraries.
# Start from a clean workspace by removing every object that ls() reports.
# NOTE(review): this also wipes anything the caller had defined before running
# this document — confirm that is acceptable.
rm(list = ls())
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides coercion warnings (e.g. from as.numeric() on
# non-numeric text), which can make data problems hard to spot.
options(warn = -1)
library(knitr)
library(ggplot2)
library(caret)
library(doParallel)
# Register a parallel backend that leaves one core free for the rest of the
# system. detectCores() can return 1 (which would request 0 workers) or NA
# (when the core count is unknown), so clamp the request to at least 1.
registerDoParallel(cores = max(1, detectCores() - 1, na.rm = TRUE))
With our libraries loaded we can start loading our data.
Let’s read in our data.
# Read one CSV file per season; the paths are relative to the working directory.
data.2015 = read.csv("data/2015.csv")
data.2016 = read.csv("data/2016.csv")
data.2017 = read.csv("data/2017.csv")
data.2018 = read.csv("data/2018.csv")
Now, we will only deal with regular season events. So let’s remove the playoffs from our datasets.
# Keep only the rows of `data` that belong to regular-season games.
#
# data: data frame with an `isPlayoffGame` column (0 marks a regular-season row)
#
# Returns a data frame with the same columns; rows whose `isPlayoffGame` is
# missing are dropped along with the playoff rows.
get.regular.season = function(data) {
  is.regular = with(data, isPlayoffGame == 0)
  # A missing flag counts as "not regular season", so mask NAs out explicitly.
  data[is.regular & !is.na(is.regular), , drop = FALSE]
}
# Strip the playoff rows from each season's raw data.
season.2015 = get.regular.season(data.2015)
season.2016 = get.regular.season(data.2016)
season.2017 = get.regular.season(data.2017)
season.2018 = get.regular.season(data.2018)
Here is a table of all the columns we shall keep and what we shall rename them to.
| Old Column Name | New Column Name |
|---|---|
| xCordAdjusted | x |
| yCordAdjusted | y |
| shotAngleAdjusted | angle |
| shotDistance | dist |
| teamCode | team |
| shotType | type |
# Build the slimmed-down analysis data frame from one season of raw data.
#
# data:        data frame with the columns xCordAdjusted, yCordAdjusted,
#              shotAngleAdjusted, shotDistance, shotType, goal and teamCode
# type.levels: shot-type labels in the order that defines `typeNum`
#              (position 1 -> code 1, and so on). The default reproduces the
#              documented mapping: 1 empty, 2 BACK, 3 DEFL, 4 SLAP, 5 SNAP,
#              6 TIP, 7 WRAP, 8 WRIST.
#
# Returns a data frame with columns x, y, angle, dist, type, typeNum, goal, team.
#
# The numeric code is derived from an explicit level list rather than from
# as.numeric(data$shotType). The bare conversion returns NA for every row when
# shotType is character (the read.csv default since R 4.0), and with factor
# input it depends on which labels happen to occur in each file, so the same
# shot type could get different codes in different seasons.
get.helpful.data = function(data,
                            type.levels = c("", "BACK", "DEFL", "SLAP",
                                            "SNAP", "TIP", "WRAP", "WRIST")) {
  shot.type = as.character(data$shotType)
  # Labels missing from type.levels are appended after the known ones, so an
  # unexpected shot type gets a code above length(type.levels) instead of NA.
  all.levels = union(type.levels, sort(unique(shot.type)))
  data.frame(x = data$xCordAdjusted,
             y = data$yCordAdjusted,
             angle = data$shotAngleAdjusted,
             dist = data$shotDistance,
             type = data$shotType,
             typeNum = as.numeric(factor(shot.type, levels = all.levels)),
             goal = data$goal,
             team = data$teamCode)
}
# type:
# 1 -> empty
# 2 -> BACK
# 3 -> DEFL
# 4 -> SLAP
# 5 -> SNAP
# 6 -> TIP
# 7 -> WRAP
# 8 -> WRIST
# Reduce each regular-season data set to the renamed analysis columns.
analysis.2015 = get.helpful.data(season.2015)
analysis.2016 = get.helpful.data(season.2016)
analysis.2017 = get.helpful.data(season.2017)
analysis.2018 = get.helpful.data(season.2018)
Now, we can remove incomplete cases and create our machine learning model’s giant data set.
# Drop every row that has a missing value in any column.
analysis.2015 = analysis.2015[complete.cases(analysis.2015),]
analysis.2016 = analysis.2016[complete.cases(analysis.2016),]
analysis.2017 = analysis.2017[complete.cases(analysis.2017),]
# Stack the three earlier seasons (2017 first, then 2016, then 2015) into the
# data set the models are trained on.
analysis.all = rbind(analysis.2017, rbind(analysis.2016, analysis.2015))
# NOTE(review): appears redundant, since each input was already filtered above.
analysis.all = analysis.all[complete.cases(analysis.all),]
# 2018 is kept separate; it is the data the models later predict on.
analysis.2018 = analysis.2018[complete.cases(analysis.2018),]
Here’s what analysis.2018 looks like:
analysis.2018
Now, we need a helper function to select certain subsets of the data. We’ll start with get.team.data (get.shooter.data and get.goalie.data are not shown here).
# Select the rows of `data` that belong to one team.
#
# data: data frame with a `team` column
# code: team code to match against `team` (e.g. "PIT")
#
# Returns a data frame with the same columns; rows with a missing team are
# never selected.
get.team.data = function(data, code) {
  on.team = with(data, team == code)
  # Comparing against a missing team yields NA; treat that as "no match".
  data[on.team & !is.na(on.team), , drop = FALSE]
}
We can calculate a few statistics, like goal (effective) percentage for a certain shot. Let’s write a function to do that right now.
# Fraction of the rows in `data` whose `goal` value equals 1.
#
# data: data frame with a `goal` column
#
# Returns a single number: (rows with goal == 1) / (all rows). The result is
# NaN for a data frame with no rows and NA if any `goal` value is missing.
calculate.goal.percentage = function(data) {
  is.goal = data$goal == 1
  sum(is.goal) / nrow(data)
}
So, for example, the Penguins’ goal percentage on slap shots would be calculated as follows:
# All 2018 rows for Pittsburgh.
penguins.2018 = get.team.data(analysis.2018, "PIT")
# typeNum 4 is the SLAP code in the shot-type mapping.
penguins.2018.slap = subset(penguins.2018, typeNum == 4)
# Share of those slap-shot rows that were goals.
penguins.2018.slap.eff = calculate.goal.percentage(penguins.2018.slap)
Their goal percentage is 0.0530035.
Using the caret package, we can build machine learning models to help us determine which shot type is best.
Let’s get started by setting up our training control.
# Resampling scheme shared by both models: 5-fold cross-validation, repeated
# 3 times.
control = trainControl(method = "repeatedcv", number = 5, repeats = 3)
Now we will train two types of models: a neural network (nnet) and k-nearest neighbors (knn).
# Fit a neural network (caret method "nnet") on the combined training seasons.
# The formula uses every column except team and type as a predictor, i.e.
# x, y, angle, dist and typeNum; "-goal" has no effect because the response is
# never part of ".".
# NOTE(review): if goal is a numeric 0/1 column, caret fits a regression rather
# than a classifier — confirm that is intended.
# NOTE(review): no random seed is set before train(), so results may differ
# between runs.
model.nnet = train(goal ~ . -goal -team -type,
data = analysis.all,
method = "nnet",
trControl = control)
## # weights: 36
## initial value 34868.630462
## iter 10 value 21035.218950
## iter 20 value 19451.876327
## iter 30 value 18791.718659
## iter 40 value 18718.264585
## iter 50 value 18659.968779
## iter 60 value 18585.497607
## iter 70 value 18559.318557
## iter 80 value 18542.281988
## iter 90 value 18523.966040
## iter 100 value 18515.062866
## final value 18515.062866
## stopped after 100 iterations
# Fit a k-nearest-neighbours model (caret method "knn") with the same formula,
# training data and resampling scheme as the neural network.
# NOTE(review): typeNum is a numeric code for a categorical shot type, so
# distances between codes carry no real meaning — confirm it should be a
# predictor here.
model.knn = train(goal ~ . -goal -team -type,
data = analysis.all,
method = "knn",
trControl = control)
Now our models have been made.
Let’s test our models on the 2018 data. Here’s what the testing data looks like:
analysis.2018
Now, let’s get our predictions:
# Predict on the 2018 data with each trained model.
nnet.prediction = predict(model.nnet, newdata = analysis.2018)
knn.prediction = predict(model.knn, newdata = analysis.2018)
# Attach each model's predictions to its own copy of the 2018 data as a new
# `predict` column.
nnet.prediction.data = data.frame(analysis.2018)
nnet.prediction.data$predict = nnet.prediction
knn.prediction.data = data.frame(analysis.2018)
knn.prediction.data$predict = knn.prediction
So, our Neural Network data looks like:
nnet.prediction.data
Our K-Nearest Neighbors data looks like:
knn.prediction.data
Let us visualize how distance and the goal prediction are related.
# Smoothed curve of the KNN prediction against shot distance.
#
# data:      data frame with `dist` and `predict` columns
# primary:   colour of the smoothed line
# secondary: fill colour of the band drawn around the line
# team:      label placed at the start of the plot title
#
# Returns the ggplot object.
make.knn.dist.plot = function(data, primary, secondary, team) {
  plot.title = paste(team, "Predicted Goal Probability versus Distance (KNN)")
  smooth.layer = geom_smooth(aes(x = dist, y = predict), method = "auto",
                             fill = secondary, color = primary)
  axis.labels = labs(title = plot.title,
                     x = "Distance from Net",
                     y = "Probability of Scoring")
  ggplot(data) + smooth.layer + axis.labels + theme_minimal()
}
Here’s the same function, but for the neural network data.
# Smoothed curve of the neural-network prediction against shot distance.
#
# data:      data frame with `dist` and `predict` columns
# primary:   colour of the smoothed line
# secondary: fill colour of the band drawn around the line
# team:      label placed at the start of the plot title
#
# Returns the ggplot object.
make.nnet.dist.plot = function(data, primary, secondary, team) {
  plot.title = paste(team, "Predicted Goal Probability versus Distance (NNet)")
  smooth.layer = geom_smooth(aes(x = dist, y = predict), method = "auto",
                             fill = secondary, color = primary)
  axis.labels = labs(title = plot.title,
                     x = "Distance from Net",
                     y = "Probability of Scoring")
  ggplot(data) + smooth.layer + axis.labels + theme_minimal()
}
Now let’s visualize how often each shot type is taken.
# Bar chart of how many rows of `data` fall into each shot type.
#
# data:      data frame with a `type` column
# primary:   outline colour of the bars
# secondary: fill colour of the bars
# team:      label placed at the start of the plot title
#
# Returns the ggplot object.
make.type.freq.plot = function(data, primary, secondary, team) {
  # The separator argument is `sep`. The previous `seq = " "` was pasted as an
  # extra piece of text and left two trailing spaces on the title.
  name = paste(team, "Frequency per Shot Type", sep = " ")
  plot = ggplot(data) +
    geom_bar(aes(x = type), stat = "count",
             fill = secondary, color = primary) +
    labs(title = name, x = "Shot Type", y = "Count") +
    theme_minimal()
  plot
}
The next graph is comparing type and goal prediction.
# Jittered scatter of the KNN prediction for each shot type.
#
# data:      data frame with `type` and `predict` columns
# primary:   passed as the point fill
# secondary: passed as the point colour
# team:      label placed at the start of the plot title
#
# Returns the ggplot object.
make.knn.type.plot = function(data, primary, secondary, team) {
  # The separator argument is `sep`. The previous `seq = " "` was pasted as an
  # extra piece of text and left two trailing spaces on the title.
  name = paste(team, "Predicted Goal versus Shot Type (KNN)", sep = " ")
  # NOTE(review): primary/secondary are swapped relative to the other plot
  # helpers (fill = primary, color = secondary) — confirm that is intended.
  plot = ggplot(data) +
    geom_jitter(aes(x = type, y = predict), fill = primary, color = secondary) +
    labs(title = name, x = "Shot Type", y = "Probability of Scoring") +
    theme_minimal()
  plot
}
Here is the neural net flavor of the last function.
# Jittered scatter of the neural-network prediction for each shot type.
#
# data:      data frame with `type` and `predict` columns
# primary:   passed as the point fill
# secondary: passed as the point colour
# team:      label placed at the start of the plot title
#
# Returns the ggplot object.
make.nnet.type.plot = function(data, primary, secondary, team) {
  # The separator argument is `sep`. The previous `seq = " "` was pasted as an
  # extra piece of text and left two trailing spaces on the title.
  name = paste(team, "Predicted Goal versus Shot Type (NNet)", sep = " ")
  # NOTE(review): primary/secondary are swapped relative to the other plot
  # helpers (fill = primary, color = secondary) — confirm that is intended.
  plot = ggplot(data) +
    geom_jitter(aes(x = type, y = predict), fill = primary, color = secondary) +
    labs(title = name, x = "Shot Type", y = "Probability of Scoring") +
    theme_minimal()
  plot
}
Let’s get our data from the Neural Network and K-Nearest Neighbors algorithms.
pit.nnet = get.team.data(nnet.prediction.data, "PIT")
pit.knn = get.team.data(knn.prediction.data, "PIT")
Let’s just visualize our shots type data.
pit.type.freq = make.type.freq.plot(pit.nnet, "#000000", "#FCB514", "PIT")
pit.type.freq
Here is how the Penguins did with shots versus goal probability from the KNN model.
pit.knn.dist = make.knn.dist.plot(pit.knn, "#000000", "#FCB514", "PIT")
pit.knn.dist
And now with the neural network.
pit.nnet.dist = make.nnet.dist.plot(pit.nnet, "#000000", "#FCB514", "PIT")
pit.nnet.dist
Now let’s look at shot type versus goal probability.
pit.knn.type = make.knn.type.plot(pit.knn, "#000000", "#FCB514", "PIT")
pit.knn.type
And now with the neural network.
pit.nnet.type = make.nnet.type.plot(pit.nnet, "#000000", "#FCB514", "PIT")
pit.nnet.type
Let’s get our data from the Neural Network and K-Nearest Neighbors algorithms.
vgk.nnet = get.team.data(nnet.prediction.data, "VGK")
vgk.knn = get.team.data(knn.prediction.data, "VGK")
Let’s just visualize our shots type data.
vgk.type.freq = make.type.freq.plot(vgk.nnet, "#B4975A", "#333F42", "VGK")
vgk.type.freq
Here is how the Golden Knights did with shots versus goal probability from the KNN model.
vgk.knn.dist = make.knn.dist.plot(vgk.knn, "#B4975A", "#333F42", "VGK")
vgk.knn.dist
And now with the neural network.
vgk.nnet.dist = make.nnet.dist.plot(vgk.nnet, "#B4975A", "#333F42", "VGK")
vgk.nnet.dist
Now let’s look at shot type versus goal probability.
vgk.knn.type = make.knn.type.plot(vgk.knn, "#B4975A", "#333F42", "VGK")
vgk.knn.type
And now with the neural network.
vgk.nnet.type = make.nnet.type.plot(vgk.nnet, "#B4975A", "#333F42", "VGK")
vgk.nnet.type
Let’s get our data from the Neural Network and K-Nearest Neighbors algorithms.
wpg.nnet = get.team.data(nnet.prediction.data, "WPG")
wpg.knn = get.team.data(knn.prediction.data, "WPG")
Let’s just visualize our shots type data.
wpg.type.freq = make.type.freq.plot(wpg.nnet, "#041E42", "#004C97", "WPG")
wpg.type.freq
Here is how the Jets did with shots versus goal probability from the KNN model.
wpg.knn.dist = make.knn.dist.plot(wpg.knn, "#041E42", "#004C97", "WPG")
wpg.knn.dist
And now with the neural network.
wpg.nnet.dist = make.nnet.dist.plot(wpg.nnet, "#041E42", "#004C97", "WPG")
wpg.nnet.dist
Now let’s look at shot type versus goal probability.
wpg.knn.type = make.knn.type.plot(wpg.knn, "#041E42", "#004C97", "WPG")
wpg.knn.type
And now with the neural network.
wpg.nnet.type = make.nnet.type.plot(wpg.nnet, "#041E42", "#004C97", "WPG")
wpg.nnet.type
Let’s get our data from the Neural Network and K-Nearest Neighbors algorithms.
tbl.nnet = get.team.data(nnet.prediction.data, "T.B")
tbl.knn = get.team.data(knn.prediction.data, "T.B")
Let’s just visualize our shots type data.
tbl.type.freq = make.type.freq.plot(tbl.nnet, "#002868", "#AAB1BF", "T.B")
tbl.type.freq
Here is how the Lightning did with shots versus goal probability from the KNN model.
tbl.knn.dist = make.knn.dist.plot(tbl.knn, "#002868", "#AAB1BF", "T.B")
tbl.knn.dist
And now with the neural network.
tbl.nnet.dist = make.nnet.dist.plot(tbl.nnet, "#002868", "#AAB1BF", "T.B")
tbl.nnet.dist
Now let’s look at shot type versus goal probability.
tbl.knn.type = make.knn.type.plot(tbl.knn, "#002868", "#AAB1BF", "T.B")
tbl.knn.type
And now with the neural network.
tbl.nnet.type = make.nnet.type.plot(tbl.nnet, "#002868", "#AAB1BF", "T.B")
tbl.nnet.type
Let’s get our data from the Neural Network and K-Nearest Neighbors algorithms.
tor.nnet = get.team.data(nnet.prediction.data, "TOR")
tor.knn = get.team.data(knn.prediction.data, "TOR")
Let’s just visualize our shots type data.
tor.type.freq = make.type.freq.plot(tor.nnet, "#003E7E", "#D3D3D3", "TOR")
tor.type.freq
Here is how the Maple Leafs did with shots versus goal probability from the KNN model.
tor.knn.dist = make.knn.dist.plot(tor.knn, "#003E7E", "#D3D3D3", "TOR")
tor.knn.dist
And now with the neural network.
tor.nnet.dist = make.nnet.dist.plot(tor.nnet, "#003E7E", "#D3D3D3", "TOR")
tor.nnet.dist
Now let’s look at shot type versus goal probability.
tor.knn.type = make.knn.type.plot(tor.knn, "#003E7E", "#D3D3D3", "TOR")
tor.knn.type
And now with the neural network.
tor.nnet.type = make.nnet.type.plot(tor.nnet, "#003E7E", "#D3D3D3", "TOR")
tor.nnet.type
Let’s get our data from the Neural Network and K-Nearest Neighbors algorithms.
col.nnet = get.team.data(nnet.prediction.data, "COL")
col.knn = get.team.data(knn.prediction.data, "COL")
Let’s just visualize our shots type data.
col.type.freq = make.type.freq.plot(col.nnet, "#6F263D", "#236192", "COL")
col.type.freq
Here is how the Avalanche did with shots versus goal probability from the KNN model.
col.knn.dist = make.knn.dist.plot(col.knn, "#6F263D", "#236192", "COL")
col.knn.dist
And now with the neural network.
col.nnet.dist = make.nnet.dist.plot(col.nnet, "#6F263D", "#236192", "COL")
col.nnet.dist
Now let’s look at shot type versus goal probability.
col.knn.type = make.knn.type.plot(col.knn, "#6F263D", "#236192", "COL")
col.knn.type
And now with the neural network.
col.nnet.type = make.nnet.type.plot(col.nnet, "#6F263D", "#236192", "COL")
col.nnet.type
Let’s get our data from the Neural Network and K-Nearest Neighbors algorithms.
cgy.nnet = get.team.data(nnet.prediction.data, "CGY")
cgy.knn = get.team.data(knn.prediction.data, "CGY")
Let’s just visualize our shots type data.
cgy.type.freq = make.type.freq.plot(cgy.nnet, "#C8102E", "#F1BE48", "CGY")
cgy.type.freq
Here is how the Flames did with shots versus goal probability from the KNN model.
cgy.knn.dist = make.knn.dist.plot(cgy.knn, "#C8102E", "#F1BE48", "CGY")
cgy.knn.dist
And now with the neural network.
cgy.nnet.dist = make.nnet.dist.plot(cgy.nnet, "#C8102E", "#F1BE48", "CGY")
cgy.nnet.dist
Now let’s look at shot type versus goal probability.
cgy.knn.type = make.knn.type.plot(cgy.knn, "#C8102E", "#F1BE48", "CGY")
cgy.knn.type
And now with the neural network.
cgy.nnet.type = make.nnet.type.plot(cgy.nnet, "#C8102E", "#F1BE48", "CGY")
cgy.nnet.type
Let’s get our data from the Neural Network and K-Nearest Neighbors algorithms.
nyi.nnet = get.team.data(nnet.prediction.data, "NYI")
nyi.knn = get.team.data(knn.prediction.data, "NYI")
Let’s just visualize our shots type data.
nyi.type.freq = make.type.freq.plot(nyi.nnet, "#00539B", "#F47D30", "NYI")
nyi.type.freq
Here is how the Islanders did with shots versus goal probability from the KNN model.
nyi.knn.dist = make.knn.dist.plot(nyi.knn, "#00539B", "#F47D30", "NYI")
nyi.knn.dist
And now with the neural network.
nyi.nnet.dist = make.nnet.dist.plot(nyi.nnet, "#00539B", "#F47D30", "NYI")
nyi.nnet.dist
Now let’s look at shot type versus goal probability.
nyi.knn.type = make.knn.type.plot(nyi.knn, "#00539B", "#F47D30", "NYI")
nyi.knn.type
And now with the neural network.
nyi.nnet.type = make.nnet.type.plot(nyi.nnet, "#00539B", "#F47D30", "NYI")
nyi.nnet.type
Let’s get our data from the Neural Network and K-Nearest Neighbors algorithms.
wsh.nnet = get.team.data(nnet.prediction.data, "WSH")
wsh.knn = get.team.data(knn.prediction.data, "WSH")
Let’s just visualize our shots type data.
wsh.type.freq = make.type.freq.plot(wsh.nnet, "#041E42", "#C8102E", "WSH")
wsh.type.freq
Here is how the Capitals did with shots versus goal probability from the KNN model.
wsh.knn.dist = make.knn.dist.plot(wsh.knn, "#041E42", "#C8102E", "WSH")
wsh.knn.dist
And now with the neural network.
wsh.nnet.dist = make.nnet.dist.plot(wsh.nnet, "#041E42", "#C8102E", "WSH")
wsh.nnet.dist
Now let’s look at shot type versus goal probability.
wsh.knn.type = make.knn.type.plot(wsh.knn, "#041E42", "#C8102E", "WSH")
wsh.knn.type
And now with the neural network.
wsh.nnet.type = make.nnet.type.plot(wsh.nnet, "#041E42", "#C8102E", "WSH")
wsh.nnet.type
Let’s get our data from the Neural Network and K-Nearest Neighbors algorithms.
phi.nnet = get.team.data(nnet.prediction.data, "PHI")
phi.knn = get.team.data(knn.prediction.data, "PHI")
Let’s just visualize our shots type data.
phi.type.freq = make.type.freq.plot(phi.nnet, "#F74902", "#000000", "PHI")
phi.type.freq
Here is how the Flyers did with shots versus goal probability from the KNN model.
phi.knn.dist = make.knn.dist.plot(phi.knn, "#F74902", "#000000", "PHI")
phi.knn.dist
And now with the neural network.
phi.nnet.dist = make.nnet.dist.plot(phi.nnet, "#F74902", "#000000", "PHI")
phi.nnet.dist
Now let’s look at shot type versus goal probability.
phi.knn.type = make.knn.type.plot(phi.knn, "#F74902", "#000000", "PHI")
phi.knn.type
And now with the neural network.
phi.nnet.type = make.nnet.type.plot(phi.nnet, "#F74902", "#000000", "PHI")
phi.nnet.type
Let’s get our data from the Neural Network and K-Nearest Neighbors algorithms.
bos.nnet = get.team.data(nnet.prediction.data, "BOS")
bos.knn = get.team.data(knn.prediction.data, "BOS")
Let’s just visualize our shots type data.
bos.type.freq = make.type.freq.plot(bos.nnet, "#FFB81C", "#000000", "BOS")
bos.type.freq
Here is how the Bruins did with shots versus goal probability from the KNN model.
bos.knn.dist = make.knn.dist.plot(bos.knn, "#FFB81C", "#000000", "BOS")
bos.knn.dist
And now with the neural network.
bos.nnet.dist = make.nnet.dist.plot(bos.nnet, "#FFB81C", "#000000", "BOS")
bos.nnet.dist
Now let’s look at shot type versus goal probability.
bos.knn.type = make.knn.type.plot(bos.knn, "#FFB81C", "#000000", "BOS")
bos.knn.type
And now with the neural network.
bos.nnet.type = make.nnet.type.plot(bos.nnet, "#FFB81C", "#000000", "BOS")
bos.nnet.type
Let’s get our data from the Neural Network and K-Nearest Neighbors algorithms.
sjs.nnet = get.team.data(nnet.prediction.data, "S.J")
sjs.knn = get.team.data(knn.prediction.data, "S.J")
Let’s just visualize our shots type data.
sjs.type.freq = make.type.freq.plot(sjs.nnet, "#006D75", "#EA7200", "S.J")
sjs.type.freq
Here is how the Sharks did with shots versus goal probability from the KNN model.
sjs.knn.dist = make.knn.dist.plot(sjs.knn, "#006D75", "#EA7200", "S.J")
sjs.knn.dist
And now with the neural network.
sjs.nnet.dist = make.nnet.dist.plot(sjs.nnet, "#006D75", "#EA7200", "S.J")
sjs.nnet.dist
Now let’s look at shot type versus goal probability.
sjs.knn.type = make.knn.type.plot(sjs.knn, "#006D75", "#EA7200", "S.J")
sjs.knn.type
And now with the neural network.
sjs.nnet.type = make.nnet.type.plot(sjs.nnet, "#006D75", "#EA7200", "S.J")
sjs.nnet.type
Let’s get our data from the Neural Network and K-Nearest Neighbors algorithms.
dal.nnet = get.team.data(nnet.prediction.data, "DAL")
dal.knn = get.team.data(knn.prediction.data, "DAL")
Let’s just visualize our shots type data.
dal.type.freq = make.type.freq.plot(dal.nnet, "#006847", "#8F8F8C", "DAL")
dal.type.freq
Here is how the Stars did with shots versus goal probability from the KNN model.
dal.knn.dist = make.knn.dist.plot(dal.knn, "#006847", "#8F8F8C", "DAL")
dal.knn.dist
And now with the neural network.
dal.nnet.dist = make.nnet.dist.plot(dal.nnet, "#006847", "#8F8F8C", "DAL")
dal.nnet.dist
Now let’s look at shot type versus goal probability.
dal.knn.type = make.knn.type.plot(dal.knn, "#006847", "#8F8F8C", "DAL")
dal.knn.type
And now with the neural network.
dal.nnet.type = make.nnet.type.plot(dal.nnet, "#006847", "#8F8F8C", "DAL")
dal.nnet.type